#Source Code File 2:
#Code for NMDS plot and random forest analysis

#####
# CHC Random Forest
#####

#####
# Import (Aggregated)
#####
library(scales)
library(vegan)
library(ggplot2)
library(plyr)
library(dplyr)
library(phyloseq)
library(randomForest)
library(rpart)
library(ggpubr)

# Global ggplot2 theme: black-and-white base, all panel grid lines removed.
theme_set(
  theme_bw(base_size = 12) +
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank()
    )
)

# Load the combined CHC / desiccation table (one row per sample).
CombinedCHCData <- read.csv("FinalCHC&Desiccation50Spp.csv", header = TRUE)

head(CombinedCHCData)
# Drop the column named X (presumably a row index written on export -- confirm).
CombinedCHCData <- CombinedCHCData %>% select(-X)

# Use the sample label as row name so downstream objects share sample names.
rownames(CombinedCHCData) <- CombinedCHCData$Label

# Columns 1-7 are treated as metadata categories.
Metadata <- CombinedCHCData[, 1:7]
head(Metadata)

# Abbreviate the sex labels and store them as a factor.
Metadata$Sex <- gsub("Female", "F", Metadata$Sex)
Metadata$Sex <- gsub("Male", "M", Metadata$Sex)
Metadata$Sex <- as.factor(Metadata$Sex)

# Columns 8-56 are treated as the CHC measurements; keep them in a data frame
# that contains CHCs only.
CHCs <- CombinedCHCData[, 8:56]
head(CHCs)

CHCNames <- colnames(CHCs)

# One-column "taxonomy" table whose single rank is the hydrocarbon name.
TAX <- tax_table(as.matrix(CHCNames))
colnames(TAX) <- "Hydrocarbon"

# phyloseq abundance table with CHCs as rows ("taxa") and samples as columns.
Matrix <- otu_table(t(CHCs), taxa_are_rows = TRUE)

# Fail fast if sample names and metadata labels are not aligned; previously
# this comparison was only printed and a mismatch would go unnoticed.
stopifnot(all(sample_names(Matrix) == Metadata$Label))

taxa_names(TAX) <- row.names(Matrix)




# Fix the display order of species (full names). Any value of SpeciesW that is
# not listed here becomes NA.
Metadata$SpeciesW <- factor(
  Metadata$SpeciesW,
  levels = c(
    "D.melanogaster", "D.simulans", "D.mauritiana", "D.erecta", "D.tessieri",
    "D.yakuba", "D.suzukii", "D.biarmipes", "D.ficusphila", "D.gunungcola",
    "D.elegans", "D.serrata", "D.birchii", "D.kikkawai", "D.ananassae",
    "D.pseudoananassae", "D.bipectinata", "D.pseudoobscura", "D.persimilis",
    "D.azteca", "D.affinis", "D.willistoni", "D.paulistorum", "D.equinoxialis",
    "D.nebulosa", "D.prosaltans", "D.saltans", "D.sturtevanti", "D.albomicans",
    "D.nasuta", "D.sulfrigaster", "D.immigrans", "D.virilis", "D.americana",
    "D.novamexicana", "D.lummei", "D.littoralis", "D.flavomontana",
    "D.lacicola", "D.borealis", "D.repleta", "D.mercatorum", "D.buzzatii",
    "D.mulleri", "D.arizonae", "D.mojavensis", "S.rufifrons", "S.lebanonensis",
    "S.latifasciaeformis", "C.procnemis"
  )
)

# Same species order using the abbreviated codes, but reversed (rev), so the
# first species listed ends up as the last factor level.
Metadata$SpeciesD <- factor(
  Metadata$SpeciesD,
  levels = rev(c(
    "Dmel", "Dsim", "Dmau", "Dere", "Dtei", "Dyak", "Dsuz", "Dbia", "Dfic",
    "Dgun", "Dele", "Dser", "Dbir", "Dkik", "Dana", "Dpda", "Dbip", "Dpse",
    "Dper", "Dazt", "Daff", "Dwil", "Dpau", "Dequ", "Dneb", "Dpro", "Dsal",
    "Dstu", "Dalb", "Dnas", "Dsul", "Dimm", "Dvir", "Dame", "Dnov", "Dlum",
    "Dlit", "Dfla", "Dlac", "Dbor", "Drep", "Dmer", "Dbuz", "Dmul", "Dari",
    "Dmoj", "Sruf", "Sleb", "Slat", "Cpro"
  ))
)

levels(Metadata$SpeciesW)

# Wrap the metadata as phyloseq sample data, keyed by sample label, and bundle
# it with the hydrocarbon table and abundance matrix into one phyloseq object.
Sampdat <- sample_data(Metadata)
sample_names(Sampdat) <- Metadata$Label
JoinedTable <- phyloseq(TAX, Matrix, Sampdat)
JoinedTable


#####
# Predicting desiccation
#####
library(ranger)
set.seed(147)

# Recover a samples x CHC data frame from the phyloseq object. The abundance
# table was stored with CHCs as rows, hence the transpose.
CHCs <- as.data.frame(t(otu_table(JoinedTable)))
head(CHCs)

# Expose the sample label as a column so it can serve as the join key.
CHCs$Label <- rownames(CHCs)

# Response variable (Desiccation) per sample label.
meta_sa <- select(CombinedCHCData, Label, Desiccation)

# Join response and predictors; merge() returns the key column (Label) first.
CHC <- merge(meta_sa, CHCs, by = "Label")
head(CHC)

rownames(CHC) <- CHC$Label
head(CHC)

# Drop the first column (Label) so only the response and predictors remain.
CHC <- CHC[, -1]
head(CHC)

names(CHC)


# Predictor columns: every column of CHC except the response.
features <- setdiff(names(CHC), "Desiccation")

# Estimate out-of-bag error as a function of mtry.
# NOTE(review): with stepFactor = 1 the candidate mtry presumably never moves
# away from the default starting value, so only a single mtry would be
# evaluated -- confirm against the tuneRF documentation before relying on this
# as a tuning step. The value is left unchanged to keep results reproducible.
mTune <- tuneRF(
  x = CHC[features],
  y = CHC$Desiccation,
  ntreeTry = 500,
  stepFactor = 1,
  trace = TRUE
)
mTune


# Fit the randomForest regression of Desiccation on all CHC columns.
set.seed(348)
mCHC <- randomForest(
  formula = Desiccation ~ .,
  data = CHC,
  ntree = 1000,
  keep.inbag = TRUE,
  importance = TRUE,
  mtry = 16
)
#head(CHC)
print(mCHC)
sqrt(mCHC$mse[length(mCHC$mse)]) # RMSE at final tree

# Error as a function of the number of trees. The tree index is derived from
# the fitted model instead of being hard-coded to 1000, so it stays in sync
# with ntree. The column is still called "MSE" for compatibility, but it holds
# the square root of the per-tree MSE, i.e. RMSE.
TreeFile <- data.frame(Tree = seq_along(mCHC$mse), MSE = sqrt(mCHC$mse))
head(TreeFile)

ErrorByTreeCHC <- ggplot(TreeFile, aes(x = Tree, y = MSE)) +
  geom_line() +
  xlab("Tree") +
  ylab("RMSE (CHC, Desiccation)")
ErrorByTreeCHC

# Fit the ranger regression forest with permutation variable importance.
# NOTE(review): mtry here (19) differs from the randomForest fit (16) --
# confirm this is intentional.
set.seed(457)
OOBModel <- ranger(
  Desiccation ~ .,
  data = CHC,
  mtry = 19,
  num.trees = 1000,
  importance = "permutation"
)
OOBModel

# Square root of the model's reported prediction error and its R-squared.
# Values noted by the original author: RMSE = 4.79, R-squared = 0.8347.
sqrt(OOBModel$prediction.error)
OOBModel$r.squared

# Importance table: one row per CHC, sorted from most to least important.
ImportanceOOB <- data.frame(
  CHC = names(OOBModel$variable.importance),
  Importance = OOBModel$variable.importance
)
head(ImportanceOOB)

ImpOOBSorted <- ImportanceOOB %>% arrange(desc(Importance))
ImpOOBSorted

# Lock the factor levels to the sorted order so plots keep this ordering.
ImpOOBSorted$CHC <- factor(ImpOOBSorted$CHC, levels = ImpOOBSorted$CHC)

# Horizontal bar chart of permutation importance per CHC. The plot is built
# once, displayed, and then written to disk.
RFimportancePlot <- ggplot(ImpOOBSorted, aes(x = CHC, y = Importance)) +
  geom_bar(stat = "identity", fill = "indianred") +
  coord_flip() +
  ggtitle("Most important CHCs for modeling desiccation resistance")

RFimportancePlot

ggsave(
  filename = "RFimportancePlot.png",
  plot = RFimportancePlot,
  device = "png",
  dpi = 600,
  width = 12,
  height = 16,
  units = "cm"
)

names(OOBModel$variable.importance)

# Pair each observed desiccation value with the prediction stored in the model.
PredictionsAndResponseOOB <- data.frame(
  TrueDesiccation = CHC$Desiccation,
  PredictedDesiccation = OOBModel$predictions
)

# Observed vs predicted scatter with a dashed 1:1 reference line.
# NOTE(review): the y-axis label mentions "+/- SE" but no error bars are drawn
# here -- confirm whether the label should be trimmed.
RFperformance.png <- ggplot(
  PredictionsAndResponseOOB,
  aes(x = TrueDesiccation, y = PredictedDesiccation)
) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  ylab("Predicted Desiccation (hrs, +/- SE)") +
  xlab("True Desiccation (hrs)")

# On-screen version with the fit statistics. The annotation is derived from
# the fitted model rather than typed in by hand, so it cannot drift out of
# sync with the model that was actually fitted.
RFperformance.png +
  annotate(
    "text",
    label = sprintf(
      "Var explained = %.1f%%\n RMSE = %.2f",
      100 * OOBModel$r.squared,
      sqrt(OOBModel$prediction.error)
    ),
    size = 6, x = 20, y = 45
  )

# The saved figure deliberately omits the annotation, as in the original.
ggsave(
  filename = "RFperformance.png",
  plot = RFperformance.png,
  device = "png",
  dpi = 600,
  width = 12,
  height = 8,
  units = "cm"
)

# Random 70/30 split of the samples into training and test sets.
train.idx <- sample(nrow(CHC), size = 0.7 * nrow(CHC))
CHC.train <- CHC[train.idx, ]
CHC.test <- CHC[-train.idx, ]

# Refit on the training rows only and predict the held-out rows.
rg.CHC <- ranger(
  Desiccation ~ .,
  data = CHC.train,
  mtry = 19,
  num.trees = 1000
)
pred.CHC <- predict(rg.CHC, data = CHC.test)

plot(pred.CHC$predictions ~ CHC.test$Desiccation)

# RMSE of the full-data model's stored predictions
# (value noted by the original author: 4.539335).
sqrt(mean((OOBModel$predictions - CHC$Desiccation)^2))

# RMSE of the training-set model's stored predictions against the training
# response (noted: 5.009103). NOTE(review): ranger's $predictions are
# presumably out-of-bag, so this is likely OOB error on the training subset
# rather than resubstitution error -- confirm.
sqrt(mean((rg.CHC$predictions - CHC.train$Desiccation)^2))

# RMSE on the held-out test set (noted: 5.340195).
sqrt(mean((pred.CHC$predictions - CHC.test$Desiccation)^2))

# Predicted vs observed values for the test set.
PredictionsAndResponseTestTrain <- data.frame(
  PredictedDesiccation = pred.CHC$predictions,
  TrueDesiccation = CHC.test$Desiccation
)
nrow(PredictionsAndResponseTestTrain)

# Squared Pearson correlation (R^2) between two numeric vectors x and y.
rsq <- function(x, y) {
  r <- cor(x, y)
  r * r
}


# Observed vs predicted scatter for the held-out test set, with a dashed 1:1
# reference line.
RFtest.png <- ggplot(
  PredictionsAndResponseTestTrain,
  aes(x = TrueDesiccation, y = PredictedDesiccation)
) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# On-screen version with the test-set size and RMSE. Both numbers are computed
# from the data frame instead of being hard-coded, so the label stays correct
# if the split or the model changes.
RFtest.png +
  annotate(
    "text",
    label = sprintf(
      "Test set (n = %d) RMSE = %.2f",
      nrow(PredictionsAndResponseTestTrain),
      with(
        PredictionsAndResponseTestTrain,
        sqrt(mean((PredictedDesiccation - TrueDesiccation)^2))
      )
    ),
    size = 6, x = 20, y = 45
  )

# The saved figure deliberately omits the annotation, as in the original.
ggsave(
  filename = "RFtest.png",
  plot = RFtest.png,
  device = "png",
  dpi = 600,
  width = 8,
  height = 8,
  units = "cm"
)


#####
# PERMANOVA & NMDS Plot
#####


# Bray-Curtis distances between samples, then an NMDS ordination on them.
GPdist <- phyloseq::distance(JoinedTable, "bray")

vare.mds <- ordinate(JoinedTable, "NMDS", GPdist)

# Sample metadata used for the environmental fit. The original wrapped this in
# subset() with no condition, which selected every row; it is taken directly.
metadataEnvfitSampleData <- sample_data(JoinedTable)
EnvFitMeta <- data.frame(metadataEnvfitSampleData$Desiccation)
head(EnvFitMeta)
# The variable gets a blank name, presumably so that the fitted arrow is drawn
# without a text label -- confirm.
colnames(EnvFitMeta) <- c("  ")

# Fit desiccation onto the ordination. The argument name is spelled out in
# full instead of relying on partial matching of "permu".
ef <- envfit(vare.mds, EnvFitMeta, permutations = 999)
ef

# Low-resolution NMDS figure (400 x 400 px): sample scores first, then the
# fitted vector overlaid (only drawn when its p-value is <= 0.05).
png(filename = "nmds2.png", width = 400, height = 400)
plot(vare.mds, display = "sites", cex = 1.5)
envplot <- plot(ef, p.max = 0.05)
dev.off()

# Publication-size NMDS figure (12 x 8 cm at 600 dpi).
# These are base-graphics plots, so they are written through a png() device.
# The original passed the return value of plot() to ggsave(), which expects a
# ggplot/grob, and drew the envfit overlay before any ordination plot existed.
png(filename = "nmds.png", width = 12, height = 8, units = "cm", res = 600)
nmds.png <- plot(vare.mds, display = "sites", cex = 1.5)
envplot <- plot(ef, p.max = 0.05)
dev.off()
